library(rpart) # to train decision trees
library(rpart.plot) # to plot decision trees
library(randomForest) # random forests
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(gbm) # boosting
## Loaded gbm 2.1.8
library(tidyverse) # tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::combine() masks randomForest::combine()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x ggplot2::margin() masks randomForest::margin()
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(cowplot)
library(ggplot2)
library(class)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-3
library(glmnetUtils)
##
## Attaching package: 'glmnetUtils'
## The following objects are masked from 'package:glmnet':
##
## cv.glmnet, glmnet
library(corrplot)
## corrplot 0.92 loaded
rawfa <- read_csv('Food Access Research Atlas.csv')
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 72531 Columns: 147
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (113): State, County, MedianFamilyIncome, LAPOP1_10, LAPOP05_10, LAPOP1_...
## dbl (34): CensusTract, Urban, Pop2010, OHU2010, GroupQuartersFlag, NUMGQTRS...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rawtract <- read_csv('pdb2019bgv6_us.csv')
## Rows: 220357 Columns: 346
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (22): GIDBG, State, State_name, County, County_name, Tract, Med_HHD_Inc...
## dbl (324): Block_group, Flag, LAND_AREA, AIAN_LAND, URBANIZED_AREA_POP_CEN_2...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
modtract <- rawtract[is.na(rawtract$Flag),] # removes uninhabitable areas
modtract1 <- modtract[!(is.na(modtract$Tot_Population_CEN_2010)|modtract$Tot_Population_CEN_2010==0),]
# removes areas with no residents
# Now, time to isolate the metrics that we actually care about
modtract2 <- modtract1 %>%
select(GIDBG,Tract,Block_group,State_name,County_name,
Tot_Population_CEN_2010,
pct_URBAN_CLUSTER_POP_CEN_2010,
pct_URBANIZED_AREA_POP_CEN_2010,
pct_RURAL_POP_CEN_2010,
pct_Males_CEN_2010,
pct_Pop_under_5_CEN_2010,
pct_Pop_5_17_CEN_2010,
pct_Pop_18_24_CEN_2010,
pct_Pop_25_44_CEN_2010,
pct_Pop_45_64_CEN_2010,
pct_Pop_65plus_CEN_2010,
pct_Tot_GQ_CEN_2010,
pct_Hispanic_CEN_2010,
pct_NH_White_alone_CEN_2010,
pct_NH_Blk_alone_CEN_2010,
pct_NH_AIAN_alone_CEN_2010,
pct_NH_Asian_alone_CEN_2010,
pct_NH_NHOPI_alone_CEN_2010,
pct_Not_HS_Grad_ACS_13_17,
pct_College_ACS_13_17,
pct_Prs_Blw_Pov_Lev_ACS_13_17,
pct_No_Health_Ins_ACS_13_17,
pct_Diff_HU_1yr_Ago_ACS_13_17,
pct_Rel_Family_HHD_CEN_2010,
pct_MrdCple_HHD_CEN_2010,
pct_Female_No_HB_CEN_2010,
pct_Sngl_Prns_HHD_CEN_2010,
pct_HHD_PPL_Und_18_CEN_2010,
avg_Tot_Prns_in_HHD_CEN_2010,
pct_PUB_ASST_INC_ACS_13_17,
avg_Agg_HH_INC_ACS_13_17,
pct_Vacant_Units_CEN_2010,
pct_Renter_Occp_HU_CEN_2010,
pct_MLT_U2_9_STRC_ACS_13_17,
pct_MLT_U10p_ACS_13_17,
pct_Mobile_Homes_ACS_13_17,
pct_NO_PH_SRVC_ACS_13_17,
pct_No_Plumb_ACS_13_17,
pct_Recent_Built_HU_ACS_13_17,
avg_Agg_House_Value_ACS_13_17) %>%
rename_with(tolower)
nrow(modtract2)-length(unique(modtract2$tract)) #There are 195505 repetitions of Tracts
## [1] 195505
# This repetition is caused by the fact there are multiple "blocks" within each Tract
# To be able to view the data at a Tract level (so we can combine with the Food Access data),
# we need to group by Tract and find the weighted avg of the blocks' data for some variables
# and the sum for others.
# Upon further research, there are also multiple tracts with the same number across states.
# Thus, we will be using gidbg as an id for each census tract. This is more in line with
# the other data set as well.
modtract3 <- modtract2 %>% separate(gidbg,into=c('gidbg','block'),sep=-1) %>% select(-block)
fintract <- na.omit(modtract3) %>%
group_by(gidbg) %>%
mutate(total_pop=sum(tot_population_cen_2010)) %>%
mutate(pct_urban=weighted.mean(pct_urban_cluster_pop_cen_2010,tot_population_cen_2010)) %>%
select(-pct_urban_cluster_pop_cen_2010) %>%
mutate(pct_urbanized=weighted.mean(pct_urbanized_area_pop_cen_2010,tot_population_cen_2010)) %>%
select(-pct_urbanized_area_pop_cen_2010) %>%
mutate(pct_rural=weighted.mean(pct_rural_pop_cen_2010,tot_population_cen_2010)) %>%
select(-pct_rural_pop_cen_2010) %>%
mutate(pct_male=weighted.mean(pct_males_cen_2010,tot_population_cen_2010)) %>%
select(-pct_males_cen_2010) %>%
mutate(pct_under_5=weighted.mean(pct_pop_under_5_cen_2010,tot_population_cen_2010)) %>%
select(-pct_pop_under_5_cen_2010) %>%
mutate(pct_5_17=weighted.mean(pct_pop_5_17_cen_2010,tot_population_cen_2010)) %>%
select(-pct_pop_5_17_cen_2010) %>%
mutate(pct_18_24=weighted.mean(pct_pop_18_24_cen_2010,tot_population_cen_2010)) %>%
select(-pct_pop_18_24_cen_2010) %>%
mutate(pct_25_44=weighted.mean(pct_pop_25_44_cen_2010,tot_population_cen_2010)) %>%
select(-pct_pop_25_44_cen_2010) %>%
mutate(pct_45_64=weighted.mean(pct_pop_45_64_cen_2010,tot_population_cen_2010)) %>%
select(-pct_pop_45_64_cen_2010) %>%
mutate(pct_65plus=weighted.mean(pct_pop_65plus_cen_2010,tot_population_cen_2010)) %>%
select(-pct_pop_65plus_cen_2010) %>%
mutate(pct_group_quarters=weighted.mean(pct_tot_gq_cen_2010,tot_population_cen_2010)) %>%
select(-pct_tot_gq_cen_2010) %>%
mutate(pct_hispanic=weighted.mean(pct_hispanic_cen_2010,tot_population_cen_2010)) %>%
select(-pct_hispanic_cen_2010) %>%
mutate(pct_white=weighted.mean(pct_nh_white_alone_cen_2010,tot_population_cen_2010)) %>%
select(-pct_nh_white_alone_cen_2010) %>%
mutate(pct_black=weighted.mean(pct_nh_blk_alone_cen_2010,tot_population_cen_2010)) %>%
select(-pct_nh_blk_alone_cen_2010) %>%
mutate(pct_aian=weighted.mean(pct_nh_aian_alone_cen_2010,tot_population_cen_2010)) %>%
select(-pct_nh_aian_alone_cen_2010) %>%
mutate(pct_asian=weighted.mean(pct_nh_asian_alone_cen_2010,tot_population_cen_2010)) %>%
select(-pct_nh_asian_alone_cen_2010) %>%
mutate(pct_nhopi=weighted.mean(pct_nh_nhopi_alone_cen_2010,tot_population_cen_2010)) %>%
select(-pct_nh_nhopi_alone_cen_2010) %>%
mutate(pct_not_hs_grad=weighted.mean(pct_not_hs_grad_acs_13_17,tot_population_cen_2010)) %>%
select(-pct_not_hs_grad_acs_13_17) %>%
mutate(pct_college=weighted.mean(pct_college_acs_13_17,tot_population_cen_2010)) %>%
select(-pct_college_acs_13_17) %>%
mutate(pct_blw_pov=weighted.mean(pct_prs_blw_pov_lev_acs_13_17,tot_population_cen_2010)) %>%
select(-pct_prs_blw_pov_lev_acs_13_17) %>%
mutate(pct_no_h_ins=weighted.mean(pct_no_health_ins_acs_13_17,tot_population_cen_2010)) %>%
select(-pct_no_health_ins_acs_13_17) %>%
mutate(pct_moved=weighted.mean(pct_diff_hu_1yr_ago_acs_13_17,tot_population_cen_2010)) %>%
select(-pct_diff_hu_1yr_ago_acs_13_17) %>%
mutate(pct_family_hhd=weighted.mean(pct_rel_family_hhd_cen_2010,tot_population_cen_2010)) %>%
select(-pct_rel_family_hhd_cen_2010) %>%
mutate(pct_married_hhd=weighted.mean(pct_mrdcple_hhd_cen_2010,tot_population_cen_2010)) %>%
select(-pct_mrdcple_hhd_cen_2010) %>%
mutate(pct_single_female=weighted.mean(pct_female_no_hb_cen_2010,tot_population_cen_2010)) %>%
select(-pct_female_no_hb_cen_2010) %>%
mutate(pct_solo_res=weighted.mean(pct_sngl_prns_hhd_cen_2010,tot_population_cen_2010)) %>%
select(-pct_sngl_prns_hhd_cen_2010) %>%
mutate(pct_hhd_children=weighted.mean(pct_hhd_ppl_und_18_cen_2010,tot_population_cen_2010)) %>%
select(-pct_hhd_ppl_und_18_cen_2010) %>%
mutate(avg_ppl_hhd=weighted.mean(avg_tot_prns_in_hhd_cen_2010,tot_population_cen_2010)) %>%
select(-avg_tot_prns_in_hhd_cen_2010) %>%
mutate(pct_pub_assist=weighted.mean(pct_pub_asst_inc_acs_13_17,tot_population_cen_2010)) %>%
select(-pct_pub_asst_inc_acs_13_17) %>%
mutate(interim_income=as.numeric(gsub("[\\$,]", "", avg_agg_hh_inc_acs_13_17))) %>%
# ^ adjusting to be numeric
mutate(avg_hhd_income=weighted.mean(interim_income,tot_population_cen_2010)) %>%
select(-avg_agg_hh_inc_acs_13_17) %>%
select(-interim_income) %>%
mutate(pct_vacant_units=weighted.mean(pct_vacant_units_cen_2010,tot_population_cen_2010)) %>%
select(-pct_vacant_units_cen_2010) %>%
mutate(pct_vacant_units=weighted.mean(pct_renter_occp_hu_cen_2010,tot_population_cen_2010)) %>%
select(-pct_renter_occp_hu_cen_2010) %>%
mutate(pct_multi_2_9=weighted.mean(pct_mlt_u2_9_strc_acs_13_17,tot_population_cen_2010)) %>%
select(-pct_mlt_u2_9_strc_acs_13_17) %>%
mutate(pct_multi_10plus=weighted.mean(pct_mlt_u10p_acs_13_17,tot_population_cen_2010)) %>%
select(-pct_mlt_u10p_acs_13_17) %>%
mutate(pct_mobile_homes=weighted.mean(pct_mobile_homes_acs_13_17,tot_population_cen_2010)) %>%
select(-pct_mobile_homes_acs_13_17) %>%
mutate(pct_no_service=weighted.mean(pct_no_ph_srvc_acs_13_17,tot_population_cen_2010)) %>%
select(-pct_no_ph_srvc_acs_13_17) %>%
mutate(pct_no_plumbing=weighted.mean(pct_no_plumb_acs_13_17,tot_population_cen_2010)) %>%
select(-pct_no_plumb_acs_13_17) %>%
mutate(pct_new_house=weighted.mean(pct_recent_built_hu_acs_13_17,tot_population_cen_2010)) %>%
select(-pct_recent_built_hu_acs_13_17) %>%
mutate(interim_house_value=as.numeric(gsub("[\\$,]", "", avg_agg_house_value_acs_13_17))) %>%
# ^ adjusting to be numeric
mutate(avg_house_value=weighted.mean(interim_house_value,tot_population_cen_2010)) %>%
select(-avg_agg_house_value_acs_13_17) %>%
select(-interim_house_value) %>%
# now, to take out some columns we only still needed for computation
select(-c(tot_population_cen_2010,block_group,tract)) %>%
distinct(gidbg,total_pop,pct_urban,.keep_all=TRUE)
rawfa1 <- rawfa %>% rename_with(tolower) %>% # selecting only important variables
select(censustract,state,county,la1and10,lahalfand10, # (both explanatory and response)
la1and20,latracts_half,latracts1,latracts10,
latracts20,latractsvehicle_20,lapophalfshare,
lakidshalf,laseniorshalf,lahunvhalfshare,lapop1share,
lakids1,laseniors1,lahunv1share,lapop10share,lakids10,
laseniors10,lahunv10share,lapop20share,lakids20,
laseniors20,lahunv20share,tractkids,tractseniors,tractsnap,
pop2010)
# the data only gave the # of kids/seniors with low access, as well as what percent of the
# total population they comprise, but not what percent OF kids/seniors are low access
finfa <- rawfa1 %>%
mutate(snap_share=tractsnap/pop2010*100) %>% # calculating % on SNAP
select(-c(tractsnap,pop2010)) %>%
mutate(tot_share_half=as.numeric(lapophalfshare)) %>% # making columns not needing
select(-lapophalfshare) %>% # changing numeric
mutate(tot_share_1=as.numeric(lapop1share)) %>% # also manages order
select(-lapop1share) %>%
mutate(tot_share_10=as.numeric(lapop10share)) %>%
select(-lapop10share) %>%
mutate(tot_share_20=as.numeric(lapop20share)) %>%
select(-lapop20share) %>%
mutate(no_vehic_half=as.numeric(lahunvhalfshare)) %>%
select(-lahunvhalfshare) %>%
mutate(no_vehic_1=as.numeric(lahunv1share)) %>%
select(-lahunv1share) %>%
mutate(no_vehic_10=as.numeric(lahunv10share)) %>%
select(-lahunv10share) %>%
mutate(no_vehic_20=as.numeric(lahunv20share)) %>%
select(-lahunv20share) %>%
mutate(kids_share_half=as.numeric(lakidshalf)/tractkids) %>% #changes
select(-lakidshalf) %>%
mutate(seniors_share_half=as.numeric(laseniorshalf)/tractseniors) %>%
select(-laseniorshalf) %>%
mutate(kids_share_1=as.numeric(lakids1)/tractkids) %>%
select(-lakids1) %>%
mutate(seniors_share_1=as.numeric(laseniors1)/tractseniors) %>%
select(-laseniors1) %>%
mutate(kids_share_10=as.numeric(lakids10)/tractkids) %>%
select(-lakids10) %>%
mutate(seniors_share_10=as.numeric(laseniors10)/tractseniors) %>%
select(-laseniors10) %>%
mutate(kids_share_20=as.numeric(lakids20)/tractkids) %>%
select(-lakids20) %>%
mutate(seniors_share_20=as.numeric(laseniors20)/tractseniors) %>%
select(-laseniors20) %>%
select(-c(tractkids,tractseniors,state,county))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
finfa$censustract <- as.numeric(finfa$censustract)
fintract$gidbg <- as.numeric(fintract$gidbg)
finfa <- finfa %>% rename(tract=censustract)
fintract <- fintract %>% rename(tract=gidbg)
us_food_access <- inner_join(fintract,finfa,by='tract')
us_food_access <- us_food_access[!(us_food_access$tract==46102940900 |
us_food_access$tract==2158000100 |
us_food_access$tract==46102940500),]
Analysis
set.seed(250)
train_samples <- sample(1:nrow(us_food_access),0.8*nrow(us_food_access))
food_train <- us_food_access[train_samples,]
food_test <- us_food_access[-train_samples,]
glm_fit_10 <- glm(latracts10 ~
pct_urban+pct_urbanized+pct_rural+
pct_male+pct_under_5+
pct_5_17+pct_18_24+
pct_25_44+pct_45_64+
pct_65plus+pct_group_quarters+
pct_hispanic+pct_white+
pct_black+pct_aian+
pct_asian+pct_nhopi+
pct_not_hs_grad+pct_college+
pct_blw_pov+pct_no_h_ins+
pct_moved+pct_family_hhd+
pct_single_female+
pct_solo_res+pct_hhd_children+
avg_ppl_hhd+pct_pub_assist+
avg_hhd_income+
pct_vacant_units+
pct_multi_2_9+
pct_multi_10plus+
pct_mobile_homes+
pct_no_service+
pct_no_plumbing+pct_new_house+
avg_house_value+snap_share,
family='binomial',
data=food_train)
summary(glm_fit_10)
##
## Call:
## glm(formula = latracts10 ~ pct_urban + pct_urbanized + pct_rural +
## pct_male + pct_under_5 + pct_5_17 + pct_18_24 + pct_25_44 +
## pct_45_64 + pct_65plus + pct_group_quarters + pct_hispanic +
## pct_white + pct_black + pct_aian + pct_asian + pct_nhopi +
## pct_not_hs_grad + pct_college + pct_blw_pov + pct_no_h_ins +
## pct_moved + pct_family_hhd + pct_single_female + pct_solo_res +
## pct_hhd_children + avg_ppl_hhd + pct_pub_assist + avg_hhd_income +
## pct_vacant_units + pct_multi_2_9 + pct_multi_10plus + pct_mobile_homes +
## pct_no_service + pct_no_plumbing + pct_new_house + avg_house_value +
## snap_share, family = "binomial", data = food_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5508 -0.0417 -0.0091 -0.0042 4.0012
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.753e+04 1.435e+04 -1.222 0.221727
## pct_urban 1.702e+02 1.434e+02 1.187 0.235178
## pct_urbanized 1.702e+02 1.434e+02 1.187 0.235264
## pct_rural 1.703e+02 1.434e+02 1.187 0.235076
## pct_male 2.245e-01 2.263e-02 9.923 < 2e-16 ***
## pct_under_5 5.195e+00 5.607e+00 0.927 0.354138
## pct_5_17 5.029e+00 5.606e+00 0.897 0.369725
## pct_18_24 4.843e+00 5.606e+00 0.864 0.387557
## pct_25_44 4.772e+00 5.606e+00 0.851 0.394626
## pct_45_64 4.877e+00 5.606e+00 0.870 0.384357
## pct_65plus 4.892e+00 5.606e+00 0.873 0.382883
## pct_group_quarters -2.161e-02 1.192e-02 -1.812 0.069935 .
## pct_hispanic 7.385e-02 2.414e-02 3.059 0.002221 **
## pct_white 5.059e-02 2.391e-02 2.116 0.034373 *
## pct_black 9.979e-02 2.453e-02 4.069 4.73e-05 ***
## pct_aian 1.074e-01 2.587e-02 4.149 3.33e-05 ***
## pct_asian 9.916e-02 3.840e-02 2.582 0.009820 **
## pct_nhopi 2.323e-01 8.157e-02 2.848 0.004398 **
## pct_not_hs_grad -1.490e-02 6.273e-03 -2.376 0.017523 *
## pct_college -7.181e-03 5.346e-03 -1.343 0.179201
## pct_blw_pov 8.457e-03 5.989e-03 1.412 0.157886
## pct_no_h_ins 1.130e-03 6.014e-03 0.188 0.850998
## pct_moved 8.725e-03 6.380e-03 1.368 0.171442
## pct_family_hhd 6.418e-02 2.384e-02 2.692 0.007093 **
## pct_single_female -1.865e-01 1.779e-02 -10.484 < 2e-16 ***
## pct_solo_res 6.858e-02 2.801e-02 2.448 0.014346 *
## pct_hhd_children 2.268e-02 2.164e-02 1.048 0.294684
## avg_ppl_hhd -3.087e+00 4.460e-01 -6.922 4.47e-12 ***
## pct_pub_assist 2.525e-02 1.459e-02 1.730 0.083580 .
## avg_hhd_income -1.065e-05 3.093e-06 -3.444 0.000574 ***
## pct_vacant_units 1.583e-02 6.057e-03 2.614 0.008951 **
## pct_multi_2_9 -4.308e-02 9.864e-03 -4.367 1.26e-05 ***
## pct_multi_10plus -6.160e-02 1.484e-02 -4.150 3.32e-05 ***
## pct_mobile_homes 1.718e-02 2.798e-03 6.139 8.29e-10 ***
## pct_no_service -1.201e-02 1.113e-02 -1.079 0.280524
## pct_no_plumbing 8.188e-02 6.457e-03 12.680 < 2e-16 ***
## pct_new_house 4.328e-02 2.096e-02 2.065 0.038880 *
## avg_house_value -5.415e-07 5.412e-07 -1.001 0.317033
## snap_share -8.014e-02 1.597e-02 -5.019 5.18e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 20675.1 on 56116 degrees of freedom
## Residual deviance: 9624.7 on 56078 degrees of freedom
## AIC: 9702.7
##
## Number of Fisher Scoring iterations: 11
fitted_probabilities <- predict(glm_fit_10,
newdata=food_test,type="response")
hist(fitted_probabilities)

predictions <- fitted_probabilities>0.5
food_test <- food_test %>%
ungroup() %>%
mutate(predicted_flag=predictions)
# misclassification rate
food_test %>% summarise(mean(latracts10!=predicted_flag))
food_test %>%
select(latracts10,predicted_flag) %>%
table()
## predicted_flag
## latracts10 FALSE TRUE
## 0 13314 105
## 1 387 224
C_FN<- 53000000000/nrow(us_food_access)+1000000 # total cost of food inequity / # of tracts
C_FP <- 500000 # cost of building supermarket
thresh <- C_FP/(C_FP+C_FN)
thresh
## [1] 0.2216748
predictions <- as.numeric(fitted_probabilities>thresh)
food_test %>%
mutate(predicted_flag=predictions) %>%
select(latracts10,predicted_flag) %>%
table()
## predicted_flag
## latracts10 0 1
## 0 12845 574
## 1 142 469
# ROC curve
roc_log <- roc(food_test %>% pull(latracts10),
fitted_probabilities)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
fpr <- 1-roc_log$specificities
tpr <- roc_log$sensitivities
tibble(fpr,tpr) %>%
ggplot(aes(x=fpr,y=tpr)) +
geom_line() +
geom_abline(slope=1,linetype='dashed') +
theme_bw()

roc_log$auc
## Area under the curve: 0.9698
food_train %>%
ggplot(aes(x=pct_college,y=latracts10)) +
geom_jitter(height=0.05) +
geom_smooth(method='glm',
formula='y~x',
method.args=list(family='binomial'),
se=F) +
ylab("Prob(flagged for having low food access") +
geom_hline(yintercept=thresh,linetype='dashed',color='red')

theme_bw()
## List of 93
## $ line :List of 6
## ..$ colour : chr "black"
## ..$ size : num 0.5
## ..$ linetype : num 1
## ..$ lineend : chr "butt"
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ rect :List of 5
## ..$ fill : chr "white"
## ..$ colour : chr "black"
## ..$ size : num 0.5
## ..$ linetype : num 1
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ text :List of 11
## ..$ family : chr ""
## ..$ face : chr "plain"
## ..$ colour : chr "black"
## ..$ size : num 11
## ..$ hjust : num 0.5
## ..$ vjust : num 0.5
## ..$ angle : num 0
## ..$ lineheight : num 0.9
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ title : NULL
## $ aspect.ratio : NULL
## $ axis.title : NULL
## $ axis.title.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 2.75points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.x.top :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 2.75points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.x.bottom : NULL
## $ axis.title.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : num 90
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 2.75points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.y.left : NULL
## $ axis.title.y.right :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : num -90
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 2.75points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : chr "grey30"
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 2.2points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x.top :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 2.2points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x.bottom : NULL
## $ axis.text.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 1
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 2.2points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.y.left : NULL
## $ axis.text.y.right :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 2.2points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.ticks :List of 6
## ..$ colour : chr "grey20"
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ lineend : NULL
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ axis.ticks.x : NULL
## $ axis.ticks.x.top : NULL
## $ axis.ticks.x.bottom : NULL
## $ axis.ticks.y : NULL
## $ axis.ticks.y.left : NULL
## $ axis.ticks.y.right : NULL
## $ axis.ticks.length : 'simpleUnit' num 2.75points
## ..- attr(*, "unit")= int 8
## $ axis.ticks.length.x : NULL
## $ axis.ticks.length.x.top : NULL
## $ axis.ticks.length.x.bottom: NULL
## $ axis.ticks.length.y : NULL
## $ axis.ticks.length.y.left : NULL
## $ axis.ticks.length.y.right : NULL
## $ axis.line : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ axis.line.x : NULL
## $ axis.line.x.top : NULL
## $ axis.line.x.bottom : NULL
## $ axis.line.y : NULL
## $ axis.line.y.left : NULL
## $ axis.line.y.right : NULL
## $ legend.background :List of 5
## ..$ fill : NULL
## ..$ colour : logi NA
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ legend.margin : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
## ..- attr(*, "unit")= int 8
## $ legend.spacing : 'simpleUnit' num 11points
## ..- attr(*, "unit")= int 8
## $ legend.spacing.x : NULL
## $ legend.spacing.y : NULL
## $ legend.key :List of 5
## ..$ fill : chr "white"
## ..$ colour : logi NA
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ legend.key.size : 'simpleUnit' num 1.2lines
## ..- attr(*, "unit")= int 3
## $ legend.key.height : NULL
## $ legend.key.width : NULL
## $ legend.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.text.align : NULL
## $ legend.title :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.title.align : NULL
## $ legend.position : chr "right"
## $ legend.direction : NULL
## $ legend.justification : chr "center"
## $ legend.box : NULL
## $ legend.box.just : NULL
## $ legend.box.margin : 'margin' num [1:4] 0cm 0cm 0cm 0cm
## ..- attr(*, "unit")= int 1
## $ legend.box.background : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.box.spacing : 'simpleUnit' num 11points
## ..- attr(*, "unit")= int 8
## $ panel.background :List of 5
## ..$ fill : chr "white"
## ..$ colour : logi NA
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ panel.border :List of 5
## ..$ fill : logi NA
## ..$ colour : chr "grey20"
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ panel.spacing : 'simpleUnit' num 5.5points
## ..- attr(*, "unit")= int 8
## $ panel.spacing.x : NULL
## $ panel.spacing.y : NULL
## $ panel.grid :List of 6
## ..$ colour : chr "grey92"
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ lineend : NULL
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ panel.grid.major : NULL
## $ panel.grid.minor :List of 6
## ..$ colour : NULL
## ..$ size : 'rel' num 0.5
## ..$ linetype : NULL
## ..$ lineend : NULL
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ panel.grid.major.x : NULL
## $ panel.grid.major.y : NULL
## $ panel.grid.minor.x : NULL
## $ panel.grid.minor.y : NULL
## $ panel.ontop : logi FALSE
## $ plot.background :List of 5
## ..$ fill : NULL
## ..$ colour : chr "white"
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ plot.title :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : 'rel' num 1.2
## ..$ hjust : num 0
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 5.5points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ plot.title.position : chr "panel"
## $ plot.subtitle :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 5.5points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ plot.caption :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : 'rel' num 0.8
## ..$ hjust : num 1
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 5.5points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ plot.caption.position : chr "panel"
## $ plot.tag :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : 'rel' num 1.2
## ..$ hjust : num 0.5
## ..$ vjust : num 0.5
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ plot.tag.position : chr "topleft"
## $ plot.margin : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
## ..- attr(*, "unit")= int 8
## $ strip.background :List of 5
## ..$ fill : chr "grey85"
## ..$ colour : chr "grey20"
## ..$ size : NULL
## ..$ linetype : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ strip.background.x : NULL
## $ strip.background.y : NULL
## $ strip.placement : chr "inside"
## $ strip.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : chr "grey10"
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 4.4points 4.4points 4.4points 4.4points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ strip.text.x : NULL
## $ strip.text.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : num -90
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ strip.switch.pad.grid : 'simpleUnit' num 2.75points
## ..- attr(*, "unit")= int 8
## $ strip.switch.pad.wrap : 'simpleUnit' num 2.75points
## ..- attr(*, "unit")= int 8
## $ strip.text.y.left :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : num 90
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi TRUE
## - attr(*, "validate")= logi TRUE
train1 <- food_train %>% ungroup() %>% select(pct_urban,
pct_urbanized,pct_rural,
pct_male,pct_under_5,
pct_5_17,pct_18_24,
pct_25_44,pct_45_64,
pct_65plus,pct_group_quarters,
pct_hispanic,pct_white,
pct_black,pct_aian,
pct_asian,pct_nhopi,
pct_not_hs_grad,pct_college,
pct_blw_pov,pct_no_h_ins,
pct_moved,pct_family_hhd,
pct_single_female,
pct_solo_res,pct_hhd_children,
avg_ppl_hhd,pct_pub_assist,
avg_hhd_income,
pct_vacant_units,
pct_multi_2_9,
pct_multi_10plus,
pct_mobile_homes,
pct_no_service,
pct_no_plumbing,pct_new_house,
avg_house_value,snap_share)
test1 <- food_test %>% select(pct_urban,
pct_urbanized,pct_rural,
pct_male,pct_under_5,
pct_5_17,pct_18_24,
pct_25_44,pct_45_64,
pct_65plus,pct_group_quarters,
pct_hispanic,pct_white,
pct_black,pct_aian,
pct_asian,pct_nhopi,
pct_not_hs_grad,pct_college,
pct_blw_pov,pct_no_h_ins,
pct_moved,pct_family_hhd,
pct_single_female,
pct_solo_res,pct_hhd_children,
avg_ppl_hhd,pct_pub_assist,
avg_hhd_income,
pct_vacant_units,
pct_multi_2_9,
pct_multi_10plus,
pct_mobile_homes,
pct_no_service,
pct_no_plumbing,pct_new_house,
avg_house_value,snap_share)
knn_results <- knn(
train=train1,
test= test1,
cl = food_train$latracts10,
k=10,
prob=TRUE
)
# adding results to test tibble
food_test <- food_test %>%
ungroup() %>%
mutate(prediction=as.numeric(knn_results == 1),
probability = attributes(knn_results)$prob,
probability = ifelse(prediction==1, probability, 1-probability))
#plotting the results
food_test %>%
ggplot(aes(x=pct_college)) +
geom_jitter(aes(y = latracts10, colour = as.factor(prediction)),
width = 0, height = 0.1) +
geom_smooth(aes(y = probability)) +
scale_y_continuous(breaks = c(0,1)) +
theme_bw()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# performance metrics of knn
conf_matrix_knn <- food_test %>%
select(latracts10,prediction) %>%
table()
conf_matrix_knn
## prediction
## latracts10 0 1
## 0 13414 5
## 1 610 1
# not possible to find false positive/false negative rate because there are no
# predicted flags in this model
food_test %>%
summarise(weighted_error = mean(C_FP*(prediction == 1 & latracts10 == 0) +
C_FN*(prediction == 0 & latracts10 == 1)))
# this all comes from the cost of incorrectly predicting a tract should not
# be flagged
# ROC curve
roc_knn <- roc(food_test %>% pull(latracts10),
food_test %>% pull(probability))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
tibble(FPR_knn = 1-roc_knn$specificities,
TPR_knn = roc_knn$sensitivities) %>%
ggplot(aes(x=FPR_knn,y=TPR_knn)) +
geom_line() +
geom_abline(slope = 1, linetype = "dashed") +
xlim(0,1)+
geom_point(x=0,y=roc_knn$sensitivities[length(roc_knn)], # FPR = 0
color='red') +
theme_bw()
## Warning: Removed 11 rows containing missing values (geom_point).

roc_knn$auc
## Area under the curve: 0.6224
num_thresholds = 100
thresholds = seq(0,1,length.out=num_thresholds)
weighted_errors = numeric(num_thresholds)
for(threshold_idx in 1:num_thresholds){
threshold = thresholds[threshold_idx]
weighted_errors[threshold_idx] =
food_test %>%
mutate(prediction = probability >= threshold) %>%
summarise(weighted_error = mean(C_FP*(prediction == 1&latracts10==0) +
C_FN*(prediction==0 &latracts10==1))) %>%
pull()
}
tibble(threshold = thresholds, weighted_error = weighted_errors) %>%
ggplot(aes(x = threshold, y = weighted_error)) +
geom_line() +
geom_vline(xintercept = c(C_FP/(C_FP + C_FN), 0.5),
linetype = "dashed", colour = "red") +
labs(x = "KNN probability threshold", y = "Average cost per transaction") +
theme_bw()

food_test_weighted <- food_test %>%
mutate(prediction = probability >= C_FP/(C_FP + C_FN))
food_test_weighted %>%
ggplot(aes(x = pct_college)) +
geom_jitter(aes(y = latracts10, colour = prediction), width = 0, height = 0.1) +
geom_smooth(aes(y = probability)) +
geom_hline(yintercept = c(0.5, C_FP/(C_FP + C_FN)),
linetype = "dashed", colour = "red") +
scale_y_continuous(breaks = c(0,1)) +
theme_bw()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

tibble(
TP=sum(as.numeric(food_test_weighted$prediction)==1&food_test_weighted$latracts10==1),
FP=sum(as.numeric(food_test_weighted$prediction)==1&food_test_weighted$latracts10==0),
FN=sum(as.numeric(food_test_weighted$prediction)==0&food_test_weighted$latracts10==1),
TN=sum(as.numeric(food_test_weighted$prediction)==0&food_test_weighted$latracts10==0))
Ridge and Lasso Regressions
source('/Users/shyankoul/Documents/Stat-471/stat-471-fall-2021/functions/plot_glmnet.R')
source('/Users/shyankoul/Documents/Stat-471/stat-471-fall-2021/functions/cross_validate_spline.R')
train2 <- train1 %>% ungroup() %>%
mutate(latracts10=food_train$latracts10)
ridge_fit <- cv.glmnet(latracts10 ~ .,
alpha=0,
nfolds=10,
family='binomial',
type.measure='class',
data=train2)
plot(ridge_fit)

plot_glmnet(ridge_fit,train2,features_to_plot=10)

probabilities <- predict(ridge_fit,
newdata=test1,
s='lambda.1se',
type='class') %>%
as.numeric()
table(food_test$latracts10,probabilities)
## probabilities
## 0 1
## 0 13390 29
## 1 555 56
lasso_fit <- cv.glmnet(latracts10 ~ .,
alpha=1,
nfolds=10,
family='binomial',
type.measure='class',
data=train2)
plot(lasso_fit)

plot_glmnet(lasso_fit,train2,features_to_plot=10)

prob_lasso <- predict(lasso_fit,
newdata=test1,
s='lambda.1se',
type='class') %>%
as.numeric()
table(food_test$latracts10,prob_lasso)
## prob_lasso
## 0 1
## 0 13335 84
## 1 431 180
Decision Tree Models
rf_fit3 <- randomForest(as.factor(as.character(latracts10)) ~ .,
mtry=3,
data=train2)
rf_fit9 <- randomForest(as.factor(as.character(latracts10)) ~ .,
mtry=9,
data=train2)
rf_fit19 <- randomForest(as.factor(as.character(latracts10)) ~ .,
mtry=19,
data=train2)
rf_fit38 <- randomForest(as.factor(as.character(latracts10)) ~ .,
mtry=38,
data=train2)
oob_errors = bind_rows(
tibble(ntree = 1:500, oob_err = rf_fit3$err.rate[,1], m = 3),
tibble(ntree = 1:500, oob_err = rf_fit9$err.rate[,1], m = 9),
tibble(ntree = 1:500, oob_err = rf_fit19$err.rate[,1], m = 19),
tibble(ntree = 1:500, oob_err = rf_fit38$err.rate[,1], m = 38)
)
oob_errors
oob_errors %>%
ggplot(aes(x = ntree, y = oob_err, colour = factor(m))) +
geom_line() + theme_bw()

head(rf_fit9$importance,5)
## MeanDecreaseGini
## pct_urban 98.75713
## pct_urbanized 215.11152
## pct_rural 610.38799
## pct_male 166.14073
## pct_under_5 102.21102
varImpPlot(rf_fit9)

rf_predictions <- as.numeric(as.character(predict(rf_fit9,newdata=food_test)))
mean((rf_predictions-food_test$latracts10)^2)
## [1] 0.03442623
gbm_fit1 = gbm(latracts10 ~ .,
distribution = "bernoulli",
n.trees = 300,
interaction.depth = 1,
shrinkage = 0.1,
cv.folds = 5,
data = train2)
gbm_fit2 = gbm(latracts10 ~ .,
distribution = "bernoulli",
n.trees = 300,
interaction.depth = 2,
shrinkage = 0.1,
cv.folds = 5,
data = train2)
gbm_fit3 = gbm(latracts10 ~ .,
distribution = "bernoulli",
n.trees = 300,
interaction.depth = 3,
shrinkage = 0.1,
cv.folds = 5,
data = train2)
gbm_fit5 = gbm(latracts10 ~ .,
distribution = "bernoulli",
n.trees = 300,
interaction.depth = 5,
shrinkage = 0.1,
cv.folds = 5,
data = train2)
ntrees <- 300
cv_errors_boost <- bind_rows(
tibble(ntree = 1:ntrees, cv_err = gbm_fit1$cv.error, depth = 1),
tibble(ntree = 1:ntrees, cv_err = gbm_fit2$cv.error, depth = 2),
tibble(ntree = 1:ntrees, cv_err = gbm_fit3$cv.error, depth = 3),
tibble(ntree = 1:ntrees, cv_err = gbm_fit5$cv.error, depth = 5))
cv_errors_boost %>%
ggplot(aes(x = ntree, y = cv_err, colour = factor(depth))) +
geom_line() + theme_bw()

optimal_num_trees = gbm.perf(gbm_fit5, plot.it = FALSE)
summary(gbm_fit3,n.trees=optimal_num_trees,plotit=FALSE)
plot(gbm_fit5, i.var = "pct_rural", n.trees = optimal_num_trees, type = "response")

plot(gbm_fit3, i.var = "pct_no_plumbing", n.trees = optimal_num_trees, type = "response")

corrplot.mixed(cor(train1),
lower = "number",
upper = "circle",
tl.col = "black")
